{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 243,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import os \n",
    "import datetime"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. 导入数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 293,
   "metadata": {},
   "outputs": [],
   "source": [
    "#os.chdir('D:\\钉钉\\CHARLS\\数据\\2018')\n",
    "#os.listdir()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 294,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 文件路径\n",
    "file_path_Demographic_Background = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Demographic_Background.dta'\n",
    "file_path_Family_Information = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Family_Information.dta'\n",
    "file_path_Family_Transfer = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Family_Transfer.dta'\n",
    "file_path_Health_Status_and_Functioning = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Health_Status_and_Functioning.dta'\n",
    "file_path_Cognition = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Cognition.dta'\n",
    "file_path_Insider = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Insider.dta'\n",
    "file_path_Health_Care_and_Insurance = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Health_Care_and_Insurance.dta'\n",
    "file_path_Work_Retirement = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Work_Retirement.dta'\n",
    "file_path_Health_Care_and_Insurance = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Health_Care_and_Insurance.dta'\n",
    "file_path_Pension = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Pension.dta'\n",
    "file_path_Individual_Income = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Individual_Income.dta'\n",
    "file_path_Housing = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Housing.dta'\n",
    "file_path_Weights = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Weights.dta'\n",
    "file_path_Sample_Infor = r'D:\\钉钉\\CHARLS\\数据\\2018\\\\Sample_Infor.dta'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. 分析基本信息表 data_Demographic_Background"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.1 列出预计会使用的变量"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- id related\n",
    "    - ID：个人样本编码； 用于匹配\n",
    "    - householdID：家庭id； 用于匹配\n",
    "    - communityID：社区id； 用于匹配\n",
    "\n",
    "\n",
    "- 性别：ba000_w2_3\n",
    "- 属相：ba001\n",
    "- 身份证或户口本上登记的出生年、月、日：ba004_w3_1, ba004_w3_2, ba004_w3_3 ;  \n",
    "- 真实的出生年、月、日：ba002_1, ba002_2，ba002_3 ；公历还是农历：ba003\n",
    "\n",
    "\n",
    "- 住址信息\n",
    "    -回访受访者\n",
    "    - bb000_w3：国内还是国外  \n",
    "    - bb000_w3_1：居住类型  \n",
    "    - bb000_w3_2：农村还是城市  \n",
    "    -新受访者\n",
    "    - bb001_w3：与上期一样、其他地方还是国外  \n",
    "    - bb001_w3_1：居住类型  \n",
    "    - bb001_w3_2：农村还是城市  \n",
    "- 出生地：bb001\n",
    "\n",
    "\n",
    "- 户口\n",
    "    - bc001_w3_2：户口类型  \n",
    "    - bc001_w3_3：户口所在地\n",
    "    - bc002_w3_1：现在的户口类型（变化后）；bc002_w3_2：变化原因\n",
    "    - bc002_w3_4：现在的户口所在地（变化后）；bc002_w3_5：变化原因\n",
    "\n",
    "\n",
    "- 教育\n",
    "    - bd001_w2_4：最高教育水平  \n",
    "    - bd006：几岁读完书    \n",
    "    - bd007_w4_1_s1-bd007_w4_1_s8：参加过的成人教育类型\n",
    "    -回访受访者\n",
    "    - bd012：是否参加过职业技术培训\n",
    "    - bd013：培训次数；bd014：累积培训时间；bd015_w4：是否获得培训证书\n",
    "    -新受访者\n",
    "    - bd012_w4：是否参加过职业技术培训\n",
    "    - bd013_w4：培训次数；bd014_w4：累积培训时间；bd017_w4：是否获得培训证书\n",
    "\n",
    "\n",
    "- 婚姻状态：be001\n",
    "- 民族：bg001_w4\n",
    "- 是否有宗教信仰：bg002_w4；信仰种类：bg003_w4\n",
    "- 是否是共产党员：bg004_w4；入党年份：bg004_w4_1\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.2 整体了解一下数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 295,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_Demographic_Background = pd.read_stata(file_path_Demographic_Background, convert_categoricals=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 296,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(19816, 112)"
      ]
     },
     "execution_count": 296,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_Demographic_Background.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 297,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>householdID</th>\n",
       "      <th>communityID</th>\n",
       "      <th>ba000_w2_3</th>\n",
       "      <th>ba001</th>\n",
       "      <th>ba004_w3</th>\n",
       "      <th>ba004_w3_1</th>\n",
       "      <th>ba004_w3_2</th>\n",
       "      <th>ba004_w3_3</th>\n",
       "      <th>ba005_w4</th>\n",
       "      <th>...</th>\n",
       "      <th>xrgender</th>\n",
       "      <th>zfrgender</th>\n",
       "      <th>zfrzodiac</th>\n",
       "      <th>zfrbirth</th>\n",
       "      <th>ziwtime</th>\n",
       "      <th>zbc004</th>\n",
       "      <th>zfredu</th>\n",
       "      <th>versionID</th>\n",
       "      <th>cyear18</th>\n",
       "      <th>cmonth18</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>094004113002</td>\n",
       "      <td>0940041130</td>\n",
       "      <td>0940041</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>1954.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>31.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2015年07月</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20200914</td>\n",
       "      <td>2018</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>094004111002</td>\n",
       "      <td>0940041110</td>\n",
       "      <td>0940041</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>1954.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2015年08月</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20200914</td>\n",
       "      <td>2018</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>094004111001</td>\n",
       "      <td>0940041110</td>\n",
       "      <td>0940041</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>1954.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2015年08月</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20200914</td>\n",
       "      <td>2018</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>094004112001</td>\n",
       "      <td>0940041120</td>\n",
       "      <td>0940041</td>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>1946.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2015年08月</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20200914</td>\n",
       "      <td>2018</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>094004118001</td>\n",
       "      <td>0940041180</td>\n",
       "      <td>0940041</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>1952.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2015年08月</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>20200914</td>\n",
       "      <td>2018</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 112 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID householdID communityID  ba000_w2_3  ba001  ba004_w3  \\\n",
       "0  094004113002  0940041130     0940041           2      6         1   \n",
       "1  094004111002  0940041110     0940041           2      7         1   \n",
       "2  094004111001  0940041110     0940041           2      7         1   \n",
       "3  094004112001  0940041120     0940041           1     11         1   \n",
       "4  094004118001  0940041180     0940041           1      5         1   \n",
       "\n",
       "   ba004_w3_1  ba004_w3_2  ba004_w3_3  ba005_w4  ...  xrgender  zfrgender  \\\n",
       "0      1954.0         1.0        31.0       2.0  ...         2        1.0   \n",
       "1      1954.0         6.0        15.0       1.0  ...         2        1.0   \n",
       "2      1954.0         6.0        15.0       1.0  ...         2        1.0   \n",
       "3      1946.0        10.0         9.0       1.0  ...         1        1.0   \n",
       "4      1952.0         4.0        14.0       2.0  ...         1        1.0   \n",
       "\n",
       "   zfrzodiac  zfrbirth   ziwtime  zbc004  zfredu  versionID  cyear18  cmonth18  \n",
       "0        1.0       1.0  2015年07月     2.0     1.0   20200914     2018         7  \n",
       "1        1.0       1.0  2015年08月     2.0     1.0   20200914     2018         7  \n",
       "2        1.0       1.0  2015年08月     2.0     1.0   20200914     2018         7  \n",
       "3        1.0       1.0  2015年08月     1.0     1.0   20200914     2018         7  \n",
       "4        1.0       1.0  2015年08月     2.0     1.0   20200914     2018         7  \n",
       "\n",
       "[5 rows x 112 columns]"
      ]
     },
     "execution_count": 297,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_Demographic_Background.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 298,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ID               0\n",
       "householdID      0\n",
       "communityID      0\n",
       "ba000_w2_3       0\n",
       "ba001            0\n",
       "              ... \n",
       "zbc004         521\n",
       "zfredu         315\n",
       "versionID        0\n",
       "cyear18          0\n",
       "cmonth18         0\n",
       "Length: 112, dtype: int64"
      ]
     },
     "execution_count": 298,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check 缺失值； 等具体问题分析再处理数据清洗问题\n",
    "data_Demographic_Background.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 299,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "样本量（个人）和变量数量： (19816, 112)\n",
      "家庭数量：  11635\n",
      "社区数量：  449\n"
     ]
    }
   ],
   "source": [
    "# 数据量\n",
    "print('样本量（个人）和变量数量：', data_Demographic_Background.shape) \n",
    "# 家庭数量\n",
    "print('家庭数量： ', data_Demographic_Background['householdID'].unique().size)\n",
    "# 社区数量\n",
    "print('社区数量： ', data_Demographic_Background['communityID'].unique().size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.3 性别分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 318,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:title={'center':'性别分布'}>"
      ]
     },
     "execution_count": 318,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEICAYAAACzliQjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAX00lEQVR4nO3dcWxV9f3/8VdLtaVc4FouhdaqUITYslBowFnELNGb1kwCM10WZXOLMnAYNizTTTOEaryuYwtxU9IAGwszDhcEh8Eo5I4tVmEN37BaqLsC41pX2lxSR1tuS+/l9t7fH/y4sXBLb28vp718no/ExPu+5577Pp/7uS9OP733NC0SiUQEADBC+kg3AACwDqEPAAYh9AHAIIQ+ABiE0AcAgxD6uKH09PTo7NmzI93GkDU0NCgYDI50GzAAoY8byp49e/TQQw/FvG/MmDE6fvx4wvuuqKjQli1bYt539uxZXbx4UZLk8/m0e/fuuPfb19en8vJybdq0KeHegHgR+rihZGVlKSsrK3r7b3/7m/bt2ydJyszMVGZm5lWPueeee5SWltbvvwcffDDmvmM9XpKefPJJPfXUU5Iu/bTx2GOP6T//+U/Mba/8aszevXsVCoW0evXq+A4SGAZCHzeUjIwMjRkzJnr7vffe0wcffCBJuvnmm3XTTTdd9ZjMzEy98847ikQiikQi+uMf/xgNd5fLpe985zuSpLS0tJjPeezYMe3bt09PP/20JGn69On6/ve/r+XLlyscDl+1fXp6um6++WbZbDbZbDY98sgj6urq0tSpU6M1m82mcePGKTMzU4899tiwxgT4KkIfN4zOzk5JUjgc1urVq9XX16fPPvtMd911l0KhkKRLSymhUEg9PT3X3NflgM/OztbYsWMH3C4cDmvVqlV64oknNHv27Gjd5XLJ4/FE/yH4qq6uLp0/f17nz5/Xtm3bNGHCBJ07d05dXV3q6uqS3++X3+9Xd3e3AoGA3njjjaEOBTAgQh83jDvvvFNtbW1KT09XfX29Dhw4oCNHjujnP/+57Ha7Ojs7NWfOHE2cOFETJ06MPi4YDKqqqkpz587V3LlztX79+ugvVS8v9wzkN7/5jU6ePCmXy9WvPmnSJO3atUvbt2/XihUrdOHCheh948ePV2Zmprq7u/Xcc8/p+eef1/jx41VVVaW1a9cmeVSA/jJGugEgGTo6OtTb26u8vDxJ0rJly7R161b19vaqo6NDN910k+x2uxoaGjRt2rR+j/3DH/6gnp4eLV++XM8884yKioo0YcKEQZ/z3Xff1QsvvKAtW7YoEomovb293/1FRUV688039fjjj+vgwYP605/+pHvvvTd6/8qVK2Wz2fSTn/xE0qVlpmv9AwMkA6GPG8LRo0dVXFwcvb1mzRp5vV5VVlbGXMf/qsuPO336tEpKSjRnzpxBn6+1tVXLli3Txo0bVV1drccffzzmdpmZmfJ4PPrhD3+oW2+9NVp/8cUXtXPnTqWnp+uWW26RpOhPF5s3b5YkBQIB1dTU6Kc//emg/QDxYnkHN4ScnJx+n35JT0/Xj3/8Y61du1YXLlzot6YfDAaja/orV65UXl6eCgoK1N3drYqKChUUFKigoEDbt28f8Pny8/P16aefas2aNfrss8/U19en6dOna+/evdFfCO/du1e33Xabpk2bJrfbrWnTpikSiWj9+vV6+eWX9fTTT6uoqCi6hr927VqtXbs2eruysnLATwsBiSL0cUOYO3fuVZ9ymTlzpnbu3KlJkybJ4XCos7NT8+bN06RJk6Jn8+vXr1ddXZ3eeustZWRk6OOPP9ZHH32kRYsWKRgMKhwOX/URy8tuv/12SZfO5js7O9Xc3Ky5c+dG729tbVVBQUG/x3z66afaunWrdu/eraVLlw56XBkZ/DCO5CL0cUN75ZVX1NPTo46ODk2cOFGNjY06f/68Tp06JUkqKCjQnXfeqTNnzqioqEiFhYWaNm2a
xowZo6ysLPX29qq3t3fQ53nzzTc1b9686D8EknTmzBnddttt/babPXu2mpubtWTJkuQeKBAnTiNgvAsXLujll1/Wd7/73X61sWPH6tFHH9XixYuv+fhjx45p3bp1+stf/tKvfubMmavO9CVFl2wuLzld6csvv9Qnn3yipqamAb9dDCSKM33cUILBYPRyCMFgcMBgvXjxoi5evKj29nYtWbJEfX19Wr16tXp7e3XmzBmdPHlSEydO1PTp0zVnzhxdvHhRZ86c6bfGHolE9Pbbb+u+++7Ts88+q4qKCknSqVOn9I9//EMHDhzo99n9WD3E6j0tLU2LFy+Ww+GI+c1gYDgIfdxQAoGAAoGAJKmqqkqTJ0/W1KlTNXXqVGVlZemee+7R1KlTlZubq40bN6qiokI+n09ut1s2m01vv/22CgoKNGHCBC1atEjSpY90Zmdn69SpU7r77rujz/WjH/1ITz31lF5//XX94he/iNbr6ur05JNPqrKyUpWVlQP2+uCDD+qTTz65qvecnBydP39ef//73zV58uRkDxEMl8bfyIXJTp8+rSlTpmjcuHGSLp1t+3y+fmvx58+fV0NDg+6+++5+Z/o9PT0KBoOy2+1Wtw0kjNAHAIOwvAMABiH0AcAghD4AGGTUf06/tbV1pFu4JofDcdWFtkYj+ky+VOmVPpMrFfrMz88f8D7O9AHAIIQ+ABiE0AcAgxD6AGAQQh8ADELoA4BBCH0AMAihDwAGIfQBwCCj/hu5AEa/vhVJ+POP7xwa/j4wKM70AcAghD4AGITQBwCDEPoAYBBCHwAMQugDgEHi+shmR0eHNm3apJdeekmSVFtbq5aWFpWWlqqysnLYNQCANQY90/f7/dq8ebMCgYAkqb6+XuFwWC6XSz6fT21tbcOqAQCsM+iZfnp6uqqqqrRx40ZJUlNTk8rKyiRJJSUl8ng88nq9Cdfy8vL6PZ/b7Zbb7ZYk1dTUyOFwJOlQr4+MjIxR36NEn9dDqvRqRZ++JOyD8bTGoKGfnZ3d73YgEFBOTo4kyWazyev1Dqt2JafTKafTGb092v8WZSr8vUyJPq+HVOk1VfoMhUIp0WcqjGdS/0ZuVlaWgsGgJKm3t1fhcHhYNQCAdYYc+oWFhfJ4PJKk5uZm5ebmDqsGALDOkEN/wYIFqqur044dO3T48GGVlpYOqwYAsE5aJBKJDPVBfr9fjY2NKi4ult1uH3btWlpbW4fanqVSYX1Pos/rIVV6taLPZFxlc8o7hxjPJLnWmn5Cl1a22WxauHBh0moAAGvwjVwAMAihDwAGIfQBwCCEPgAYhNAHAIMQ+gBgkIQ+sonU43t4+B+THbPt3SR0AmAkcaYPAAYh9AHAIIQ+ABiE0AcAgxD6AGAQQh8ADELoA4BBCH0AMAihDwAGIfQBwCCEPgAYhNAHAIMQ+gBgEEIfAAxC6AOAQQh9ADAIoQ8ABiH0AcAghD4AGITQBwCDEPoAYBBCHwAMQugDgEEyhvoAv9+v1157TZ2dnSosLNTKlStVW1urlpYWlZaWqrKyUpLirgEArDPkM/0PP/xQixYtUk1NjS5cuKC9e/cqHA7L5XLJ5/Opra1N9fX1cdUAANYa8pn++PHj9d///lfd3d368ssvlZ2drbKyMklSSUmJPB6PvF5vXLW8vLwkHgoAYDBDDv277rpLR48e1fvvv69bb71VoVBIOTk5kiSbzSav16tAIBBXLRa32y232y1JqqmpkcPhSOjArJKRkTHqe5QkXxL2YcVxpsp4SqnTqxV9JmN+MZ7WGHLo79q1SytWrFB2drb27dunnTt36oEHHpAk9fb2KhwOKysrS8FgcNBaLE6nU06nM3q7vb19yAdlJYfDMep7TBYrjjOVxjNVek2VPkOhUEr0mQrjmZ+fP+B9Q17T7+7u1hdffKFwOKyTJ0/qW9/6ljwejySpublZubm5KiwsjKsGALDWkEP/4Ycf1tatW/WDH/xAfr9fDz30kOrq6rRjxw4d
PnxYpaWlWrBgQVw1AIC10iKRSGS4O/H7/WpsbFRxcbHsdvuQaoNpbW0dbnvXVSr8qCdJfSuWDHsfY7a9m4ROri1VxlNKnV6t6DMZ82vKO4cYzyS51vLOkNf0Y7HZbFq4cGFCNQCAdfhGLgAYhNAHAIMQ+gBgEEIfAAxC6AOAQQh9ADAIoQ8ABiH0AcAghD4AGITQBwCDEPoAYBBCHwAMQugDgEEIfQAwCKEPAAYh9AHAIIQ+ABiE0AcAgxD6AGAQQh8ADELoA4BBCH0AMAihDwAGIfQBwCCEPgAYhNAHAIMQ+gBgEEIfAAxC6AOAQQh9ADAIoQ8ABskYzoN///vfa+7cuZo/f75qa2vV0tKi0tJSVVZWSlLcNQCANRI+0//3v/+tjo4OzZ8/X/X19QqHw3K5XPL5fGpra4u7BgCwTkJn+qFQSFu2bNG8efN05MgRNTU1qaysTJJUUlIij8cjr9cbVy0vL6/fvt1ut9xutySppqZGDocj4YOzQkZGxqjvUZJ8SdiHFceZKuMppU6vVvSZjPnFeFojodD/8MMPVVBQoKVLl+r999/X/v37df/990uSbDabvF6vAoGAcnJyBq1dyel0yul0Rm+3t7cn0qJlHA7HqO8xWaw4zlQaz1TpNVX6DIVCKdFnKoxnfn7+gPclFPper1dOp1N2u1333XefTpw4oWAwKEnq7e1VOBxWVlZWXDUAgHUSWtOfOnWqfL5LP9CdPn1aZ8+elcfjkSQ1NzcrNzdXhYWFcdUAANZJ6Ez//vvvV21trQ4dOqRQKKTq6mpt3LhR586dU0NDg1wulyRpw4YNcdUAANZIi0QikWTsyO/3q7GxUcXFxbLb7UOqXUtra2sy2rtuUmF9T5L6ViwZ9j7GbHs3CZ1cW6qMp5Q6vVrRZzLm15R3DjGeSZL0Nf1YbDabFi5cmFANAGANvpELAAYh9AHAIIQ+ABiE0AcAgxD6AGAQQh8ADELoA4BBCH0AMAihDwAGIfQBwCCEPgAYhNAHAIMQ+gBgEEIfAAxC6AOAQQh9ADAIoQ8ABiH0AcAghD4AGITQBwCDEPoAYBBCHwAMQugDgEEIfQAwCKEPAAYh9AHAIIQ+ABiE0AcAgxD6AGAQQh8ADJKR6AM7Ojr0yiuvaOPGjaqtrVVLS4tKS0tVWVkpSXHXAADWSfhM/4033lAwGFR9fb3C4bBcLpd8Pp/a2trirgEArJVQ6B8/flyZmZmy2+1qampSWVmZJKmkpEQejyfuGgDAWkNe3gmFQtq9e7eeeeYZ/frXv1YgEFBOTo4kyWazyev1xl2Lxe12y+12S5JqamrkcDgSOjCrZGRkjPoeJcmXhH1YcZypMp5S6vRqRZ/JmF+MpzWGHPp//etfVV5ernHjxkmSsrKyFAwGJUm9vb0Kh8Nx12JxOp1yOp3R2+3t7UNt0VIOh2PU95gsVhxnKo1nqvSaKn2GQqGU6DMVxjM/P3/A+4Yc+seOHdPx48e1f/9+ff7552pvb9ekSZM0a9YsNTc3Kz8/X5MmTZLH4xm0BgCw1pBD/8UXX4z+f3V1tX72s59pw4YNOnfunBoaGuRyuSQp7hoAwDppkUgkMtyd+P1+NTY2qri4WHa7fUi1wbS2tg63vesqFX7Uk6S+FUuGvY8x295NQifXlirjKaVOr1b0mYz5NeWdQ4xnkiR1eScWm82mhQsXJlQDAFiHb+QCgEEIfQAwCKEPAAYh9AHAIIQ+ABiE0AcAgxD6AGAQQh8ADELoA4BBCH0AMAihDwAGIfQBwCCEPgAYhNAHAIMQ+gBgEEIfAAxC6AOAQQh9ADAIoQ8ABiH0AcAghD4AGITQBwCDEPoAYBBCHwAMQugDgEEIfQAwCKEPAAYh9AHAIIQ+ABiE0AcAgxD6AGCQjEQe1NPTo1dffVXhcFiZmZmqqqrStm3b1NLSotLSUlVWVkqSamtr46oBAKyR0Jl+XV2dFi9erHXr1slut+vjjz9WOByWy+WSz+dTW1ub6uvr
46oBAKyT0Jl+RUVF9P+7urpUV1enb37zm5KkkpISeTweeb1elZWVDVrLy8vrt2+32y232y1JqqmpkcPhSKRFy2RkZIz6HiXJl4R9WHGcqTKeUur0akWfyZhfjKc1Egr9y06cOKHu7m5NnjxZOTk5kiSbzSav16tAIBBX7UpOp1NOpzN6u729fTgtXncOh2PU95gsVhxnKo1nqvSaKn2GQqGU6DMVxjM/P3/A+xL+Ra7f79f27du1atUqZWVlKRgMSpJ6e3sVDofjrgEArJNQ6IdCIW3atEnLli3T5MmTVVhYKI/HI0lqbm5Wbm5u3DUAgHUSCv2DBw/K6/Vqz549qq6uViQSUV1dnXbs2KHDhw+rtLRUCxYsiKsGALBOQmv65eXlKi8v71ebP3++GhsbtXTpUmVnZ0uSNmzYEFcNAGCNYf0i96tsNpsWLlyYUA0AYA2+kQsABiH0AcAghD4AGITQBwCDEPoAYBBCHwAMQugDgEEIfQAwCKEPAAYh9AHAIIQ+ABiE0AcAgxD6AGAQQh8ADELoA4BBCH0AMAihDwAGIfQBwCCEPgAYhNAHAIMQ+gBgEEIfAAxC6AOAQQh9ADAIoQ8ABiH0AcAghD4AGITQBwCDEPoAYBBCHwAMkjEST1pbW6uWlhaVlpaqsrJyJFoAACNZfqZfX1+vcDgsl8sln8+ntrY2q1sAAGNZHvpNTU0qKyuTJJWUlMjj8VjdAgAYy/LlnUAgoJycHEmSzWaT1+vtd7/b7Zbb7ZYk1dTUKD8/3+oWhywVetR7/zfSHcQtJcbz/0uVXq97n0maX4zn9Wf5mX5WVpaCwaAkqbe3V+FwuN/9TqdTNTU1qqmpsbq1hDz33HMj3UJc6DP5UqVX+kyuVOlzIJaHfmFhYXRJp7m5Wbm5uVa3AADGsjz0FyxYoLq6Ou3YsUOHDx9WaWmp1S0AgLEsX9PPzs7Whg0b1NjYqKVLlyo7O9vqFpLK6XSOdAtxoc/kS5Ve6TO5UqXPgaRFIpHISDcBALAG38gFAIOMyDdyYa6Ojg5t2rRJL730Usz7e3p69OqrryocDiszM1NVVVVKS0vT6tWrNWXKFEnSE088odtvv93Ktke1wcb0wIEDOnTokCSpu7tbM2fO1PLlyxnTGGLNv4yMjEG3SaU5yvLONQx2uYjR9GYa7I0vxT4eKy+J4ff79dvf/lZdXV361a9+FXOb/fv3Ky8vT3PmzNG2bds0b9485eTk6NChQ/re9753Xfv7qsHGs6+vL+brbPUlRuIZ06/avn27vvGNbygtLc2yMY0nSKWRn59S7Pk3f/78QbcZiTmaKJZ3BhDP5SLKy8tVXV2t6upqFRUV6YEHHlBzc7PuvffeaN2KwPf7/dq8ebMCgcCA28Q6HqsviZGenq6qqiqNHTt2wG0qKio0Z84cSVJXV5cmTJigkydP6ujRo3r++edVW1urvr6+69pnPOMZ63UeiUuMxDOml/3vf/9TR0eHZsyYYemY1tXVafHixVq3bp3sdrsaGhqu2mY0zE8p9vyLZxur5+hwEPoDGMrlIkbqzXRZPG/8WMdj9SUxsrOz4/601okTJ9Td3a1Zs2ZpxowZeuGFF/TLX/5SfX19+te//nVd+4xnPGO9ziNxiZGhjOkHH3yg8vJySbJ0TOMJ0tEwP7/qq/Mvnm2snqPDQegP4MrLRXR2dg647Ui9mS6L540f63iGcoxW8vv92r59u1atWiVJuuOOO3TLLbdIuvTlvut9xhfPeMZ6nUfreEpSOBxWU1OTZs+eLcn6MZWuHaSjaX5eOf/i2WYkxjNRhP4ABrtcxGWj4c0Uj1jHE+8xWikUCmnTpk1atmyZJk+eLEl67bXX9PnnnyscDuvIkSO64447RrjL2K/zaBzPyzwej2bOnKm0tDRJ1o/pYEE6WuZnrPkXzzajcY4OhNAfQLyXixjpN1O8Yh3PSF8So6WlRW+99Va/2sGDB+X1erVnzx5VV1fr0KFD+va3v63XX39d
zz77rGbNmhVdKhhJsV7nkR5PKfaYSlJDQ4OKioqit60c03iCdLTMzyvn365du1J2jg6ET+8MoKenRxs2bNDXvvY1NTQ0aM2aNfrnP/+pRx55pN92f/7znzVjxgx9/etflyR98cUX+t3vfqdIJKL58+fr0Ucftazny79UbGlp0UcffdSv1yuPx+VySdJVtVT/hnQyXWs8Y73OscaY8bz0KbedO3dGT4Bmz56tvr4+5ucIIfSvwe/3q7GxUcXFxbLb7SPdzrDFOp4b7RhHGuOZOOanNQh9ADAIa/oAYBBCHwAMQugDgEEIfQAwCKEPAAb5f+FvfVZOBABrAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "# 设置字体\n",
    "mpl.rcParams['font.sans-serif'] = ['SimHei']\n",
    "# 设置风格\n",
    "plt.style.use('ggplot')\n",
    "a = Demographic_Background[~demo]['ba000_w2_3'].value_counts() # 1男 2女\n",
    "plt.bar(a.index, a.values)\n",
    "plt.title('性别分布')\n",
    "Demographic_Background['ba000_w2_3'].hist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.4 年龄分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 319,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count    19409.000000\n",
      "mean         6.584832\n",
      "std          3.496086\n",
      "min          1.000000\n",
      "25%          3.000000\n",
      "50%          7.000000\n",
      "75%         10.000000\n",
      "max         12.000000\n",
      "Name: ba004_w3_2, dtype: float64\n",
      "count    19494.000000\n",
      "mean      1956.257053\n",
      "std         10.332225\n",
      "min       1900.000000\n",
      "25%       1949.000000\n",
      "50%       1957.000000\n",
      "75%       1965.000000\n",
      "max       2000.000000\n",
      "Name: ba004_w3_1, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# check data\n",
    "print(Demographic_Background['ba004_w3_2'].describe())  # 月份\n",
    "print(Demographic_Background['ba004_w3_1'].describe())  # 年份\n",
    "# -9 为缺失值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 320,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "月缺失数据： 0\n",
      "年缺失数据：  0\n",
      "年缺失数据：  0\n"
     ]
    }
   ],
   "source": [
    "print('月缺失数据：', Demographic_Background[Demographic_Background['ba004_w3_2']==-9].shape[0])\n",
    "print('年缺失数据： ',Demographic_Background[Demographic_Background['ba004_w3_1']==-9].shape[0])\n",
    "print('年缺失数据： ',Demographic_Background[Demographic_Background['ba004_w3_1']==-1].shape[0])  # check codebook, 年份有 -1（不知道）， -9\n",
    "# 处理方法，剔除年缺失数据，月缺失数据补充月份为 6 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 323,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"None of [Index(['cyear18', 'cmonth18'], dtype='object')] are in the [columns]\"",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-323-fb799bf10ca0>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     13\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     14\u001b[0m \u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'birthday'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'ba004_w3_1'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'ba004_w3_2'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcombine_date\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 15\u001b[1;33m \u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'visitday'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'cyear18'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'cmonth18'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcombine_date\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0maxis\u001b[0m \u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   3028\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mis_iterator\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3029\u001b[0m                 \u001b[0mkey\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3030\u001b[1;33m             \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_listlike_indexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3031\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3032\u001b[0m         \u001b[1;31m# take() does not accept boolean indexers\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_get_listlike_indexer\u001b[1;34m(self, key, axis, raise_missing)\u001b[0m\n\u001b[0;32m   1264\u001b[0m             \u001b[0mkeyarr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnew_indexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0max\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reindex_non_unique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkeyarr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1265\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1266\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_read_indexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkeyarr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mraise_missing\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mraise_missing\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1267\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mkeyarr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindexer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1268\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_validate_read_indexer\u001b[1;34m(self, key, indexer, axis, raise_missing)\u001b[0m\n\u001b[0;32m   1306\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mmissing\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1307\u001b[0m                 \u001b[0maxis_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_axis_name\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1308\u001b[1;33m                 \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"None of [{key}] are in the [{axis_name}]\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1309\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1310\u001b[0m             \u001b[0max\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: \"None of [Index(['cyear18', 'cmonth18'], dtype='object')] are in the [columns]\""
     ]
    }
   ],
   "source": [
    "# 出生年、月：ba004_w3_1, ba004_w3_2 ;  调查年、月：令cyear18=2018, cmonth18=7  用于计算访问时的年龄\n",
    "# 1. 合并年月\n",
    "def combine_date(x):\n",
    "#     print(x[0])\n",
    "#     print(x[1])\n",
    "#     print(str(x[0]).split('.')[0]+str(x[1]).split('.')[0])\n",
    "    if (x[0] == -9) | (x[0] == -1):  # 剔除年份缺失数据\n",
    "        return None\n",
    "    else:\n",
    "        if x[1] == -9:  # 月份缺失数据填 6 月\n",
    "            return str(x[0]).split('.')[0]+'6'\n",
    "        return str(x[0]).split('.')[0]+str(x[1]).split('.')[0]\n",
    "\n",
    "Demographic_Background['birthday']=Demographic_Background[['ba004_w3_1','ba004_w3_2']].apply(combine_date,axis=1)\n",
    "Demographic_Background['visitday']=Demographic_Background[['cyear18','cmonth18']].apply(combine_date,axis =1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 325,
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "closing parenthesis '}' does not match opening parenthesis '(' (<ipython-input-325-077f8337f816>, line 1)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;36m  File \u001b[1;32m\"<ipython-input-325-077f8337f816>\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m    data = pd.merge(data, Demographic_Background[['ba004_w3_1','ba004_w3_2']], how='inner'}\u001b[0m\n\u001b[1;37m                                                                                          ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m closing parenthesis '}' does not match opening parenthesis '('\n"
     ]
    }
   ],
   "source": [
    "data = pd.merge(data, Demographic_Background[['ba004_w3_1','ba004_w3_2']], how='inner'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 314,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "time data 'nannan' does not match format '%Y%m' (match)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\tools\\datetimes.py\u001b[0m in \u001b[0;36m_convert_listlike_datetimes\u001b[1;34m(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)\u001b[0m\n\u001b[0;32m    455\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 456\u001b[1;33m                 \u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtz\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconversion\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdatetime_to_datetime64\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    457\u001b[0m                 \u001b[0mdta\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mDatetimeArray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtz_to_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtz\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\tslibs\\conversion.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslibs.conversion.datetime_to_datetime64\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mTypeError\u001b[0m: Unrecognized value type: <class 'str'>",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-314-1fae0fa8c502>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 2. 转换为 datetime 格式\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'birthday_dt'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'birthday'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'%Y%m'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m \u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'visitday_dt'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'visitday'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'%Y%m'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\tools\\datetimes.py\u001b[0m in \u001b[0;36mto_datetime\u001b[1;34m(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache)\u001b[0m\n\u001b[0;32m    799\u001b[0m                 \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtz_localize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtz\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    800\u001b[0m     \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mABCSeries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 801\u001b[1;33m         \u001b[0mcache_array\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_maybe_cache\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconvert_listlike\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    802\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mcache_array\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mempty\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    803\u001b[0m             \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcache_array\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\tools\\datetimes.py\u001b[0m in \u001b[0;36m_maybe_cache\u001b[1;34m(arg, format, cache, convert_listlike)\u001b[0m\n\u001b[0;32m    176\u001b[0m         \u001b[0munique_dates\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    177\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0munique_dates\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m<\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 178\u001b[1;33m             \u001b[0mcache_dates\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconvert_listlike\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0munique_dates\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    179\u001b[0m             \u001b[0mcache_array\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSeries\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcache_dates\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0munique_dates\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    180\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0mcache_array\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\tools\\datetimes.py\u001b[0m in \u001b[0;36m_convert_listlike_datetimes\u001b[1;34m(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)\u001b[0m\n\u001b[0;32m    458\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mDatetimeIndex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_simple_new\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdta\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    459\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mValueError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 460\u001b[1;33m                 \u001b[1;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    461\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    462\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\tools\\datetimes.py\u001b[0m in \u001b[0;36m_convert_listlike_datetimes\u001b[1;34m(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)\u001b[0m\n\u001b[0;32m    421\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    422\u001b[0m                 \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 423\u001b[1;33m                     result, timezones = array_strptime(\n\u001b[0m\u001b[0;32m    424\u001b[0m                         \u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexact\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mexact\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    425\u001b[0m                     )\n",
      "\u001b[1;32mpandas\\_libs\\tslibs\\strptime.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslibs.strptime.array_strptime\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mValueError\u001b[0m: time data 'nannan' does not match format '%Y%m' (match)"
     ]
    }
   ],
   "source": [
    "# 2. Convert to datetime\n",
    "# errors='coerce' turns unparsable strings (e.g. the 'nannan' produced from a NaN year/month)\n",
    "# into NaT instead of aborting the whole conversion.\n",
    "Demographic_Background['birthday_dt'] = pd.to_datetime(Demographic_Background['birthday'], format='%Y%m', errors='coerce')\n",
    "Demographic_Background['visitday_dt'] = pd.to_datetime(Demographic_Background['visitday'], format='%Y%m', errors='coerce')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 310,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'visitday_dt'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mget_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3079\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3080\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3081\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'visitday_dt'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-310-355ecc330b5c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 计算年龄 用调查时间 - 出生日期\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'age'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'visitday_dt'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'birthday_dt'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdays\u001b[0m\u001b[1;33m/\u001b[0m\u001b[1;36m365\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   3022\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3023\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3024\u001b[1;33m             \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3025\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3026\u001b[0m                 \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mget_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3080\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3081\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3082\u001b[1;33m                 \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3083\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3084\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mtolerance\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'visitday_dt'"
     ]
    }
   ],
   "source": [
    "# Age at interview, in years: (survey date - birth date) in days, divided by 365.\n",
    "# NOTE(review): a fixed 365-day year ignores leap days, so ages come out slightly high.\n",
    "age_timedelta = Demographic_Background['visitday_dt'] - Demographic_Background['birthday_dt']\n",
    "Demographic_Background['age'] = age_timedelta.dt.days / 365"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 311,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'age'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mget_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3079\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3080\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3081\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'age'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-311-a27bbbe3e1e6>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'age'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtitle\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'年龄分布'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   3022\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3023\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3024\u001b[1;33m             \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3025\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3026\u001b[0m                 \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mget_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3080\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3081\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3082\u001b[1;33m                 \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3083\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3084\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mtolerance\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'age'"
     ]
    }
   ],
   "source": [
    "# Histogram of age at interview. Series.hist returns the Axes it drew on, so the title is\n",
    "# set on that Axes directly instead of going through a global `plt` name.\n",
    "ax = Demographic_Background['age'].hist()\n",
    "ax.set_title('年龄分布');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'age'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mget_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3079\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3080\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3081\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'age'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-258-f4b5808a36a7>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 年龄分段\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mage_cut\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m16\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m23\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m46\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m60\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m120\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mage_seg\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcut\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'age'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mbins\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mage_cut\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlabels\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'0-16'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'16-22'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'23-45'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'46-60'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'60+'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m \u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'age_segment'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mage_seg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[0ma\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'age_segment'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   3022\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3023\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3024\u001b[1;33m             \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3025\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3026\u001b[0m                 \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\pythonanaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001b[0m in \u001b[0;36mget_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3080\u001b[0m                 \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcasted_key\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3081\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 3082\u001b[1;33m                 \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   3083\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3084\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mtolerance\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: 'age'"
     ]
    }
   ],
   "source": [
    "# Age bands. right=False makes every bin closed on the left, [lo, hi), so an age of exactly\n",
    "# 23 falls in '23-45' and exactly 60 in '60+', as those labels say; the pd.cut default\n",
    "# (right-closed bins) would push such boundary ages into the band below.\n",
    "# NOTE(review): with left-closed bins '0-16' stops short of 16 and '46-60' short of 60;\n",
    "# the label text is kept unchanged in case later cells match on it.\n",
    "# Missing ages and ages outside [-1, 120) get NaN.\n",
    "age_cut = [-1, 16, 23, 46, 60, 120]\n",
    "age_labels = ['0-16', '16-22', '23-45', '46-60', '60+']\n",
    "age_seg = pd.cut(Demographic_Background['age'], bins=age_cut, labels=age_labels, right=False)\n",
    "Demographic_Background['age_segment'] = age_seg.values\n",
    "a = Demographic_Background['age_segment'].value_counts()\n",
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 259,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA6sAAAJOCAYAAABRHJEAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAi80lEQVR4nO3df3TedX3//0cgtKUETiihrrVqLUeO0I2OHjrWwnSbWYHBZK7IQY6c/UAHetxBcHL0ALZ61mN0G2OIqyhjpzC3Ok5B3D5HYJF5rKN2OlbKolHEEhaoYUV+mJYmpFe+f+DytSQlAfrjmfZ2+6u5rleuPJv3eefVe6/3daVpeHh4OAAAAFDIIft7AAAAAHgxsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAsJc9++yz2Rdvvr9x48YMDg7u9a8DAPuCWAWAMQwPD+ekk07KJz/5yZf1eSeffHLuv//+XW7r6OjIeeedN+7nPvHEE3n++eeTJH19fVm7du2Ev+7OnTuzdOnSXHvttS9rXgCoSqwCwBhuvvnmfP/738/v/d7vjbt2586d+drXvpYkmTJlSo466qhd7l+/fn2+973vZWho6CUf55JLLsn73//+JMn27dtz0UUX5eGHHx5z7Yufqb3zzjszNDSUD3zgA+POCwCTQdPwvrguCQAmkb6+vvzSL/1SBgYGMmvWrCQZubx2ypQpGR4ezrZt29Lc3JxHHnkkg4ODmTNnTp544oksWbIkzzzzTAYGBvLDH/4wjz76aObPn5+lS5fm1FNPzZVXXjnm13zwwQezcOHCbNy4MfPnz0+SXHrppenu7s69996bQw7Z9f+Xm5qacthhh2XKlCkj8zUajUybNm2XdcPDwxkaGsr555+fW2+9dY9+nwBgbxKrAPBzGo1GzjzzzGzZsiX/8R//kcMPPzxJ8sEPfjBJct111436nOHh4cyZMyePPfZYTjvttHzxi1/M3LlzkyQf+tCHsn379qxYsSILFy7MLbfckre97W2jvuZb3vKWzJ8/PzfeeOPI7U8++WTmz5+f888/P9dff/0un/PTn/40U6ZMyZQpU7JmzZr8yZ/8STZv3pwjjjgiSUbFLQBMNs37ewAAqOSSSy7J+vXr861vfWskVMezYsWKbN++PTfccEN+/OMfZ/Xq1Wlpacm5556bv/3bv82GDRsyODiY1atXZ9myZfnLv/zLXHzxxSOf/xd/8Rd56KGH8uUvf3mXxz3mmGNy22235ayzzspzzz2X66+/fmSmI488MknS39+fj3zkI/noRz+aI488MpdddlmamprGjGoAmEw8swoAPzM0NJR/+qd/ylFHHZXVq1fnP//zP0cus33iiSeSvBCQzz//fPr7+3PTTTfld3/3d/N3f/d3mTp1apqamvK///u/OeaYYzIwMJDPfe5zOe+88/LGN74xn/70p7Nhw4Z0dnbmuuuuy5e+9KUceeSR+cpXvpJ3vvOdufHGG3P22Wenqalp1Fz//u//nj/8wz/M0UcfnVtuuSWnnXbayH0XXnhhHnjggWzcuDGHHXZYrrzyyjQ1NeVTn/rUvvmmAcBeIlYBYAy//du/nfPPPz9/8Ad/kOSlLwP+53/+5/z4xz/O2WefnYULF6a3tzdXXHFF3vWud+VXf/VXkyQXX3xxVq5cOfIa2CR5/PHHc/zxx2flypX5q7/6q/T09Iw5y9SpU9Pd3Z33vOc9uemmm0YuMf74xz+eFStW5JBDDhl5xvXnX1ubJAMDA+no6MiHPvShV/stAYB9ygtaAGAMhx566ITXPvroo3n44YezYcOGzJ8/P83NzVm9enWmT5+eX//1X89v/dZv5dFHH81FF12Us846K0899VSSZPbs2fnud7+byy67LN///vezc+fOvPGNb8ydd96Z4eHhDA8P584778zrXve6zJ07N52dnZk7d26Gh4fzsY99LH/2Z3+WD37wgznhhBPS39+f/v7+
XHHFFbniiitGPl62bFmmTp26t75NALDXeM0qAPzMzp07MzQ0lObm5gm9QdHzzz+fRqORQw45JFOmTMkdd9yRH/3oR/m3f/u3HH744Xnzm9+c22+/fSR83//+9+fYY4/N0UcfPfIYr3/965O88OzpU089lZ6envzyL//yyP2PP/545syZs8vX/e53v5vPf/7zWbt2bY466qj867/+60vO2dxsuwdg8rF7AcDP3HXXXbnkkksyZcqUHHbYYeno6EhHR0eS//81q3fdddfI+qGhoSxatChvfetb09vbm29+85u59dZbc8011+TQQw/N1KlTc8cdd2Tx4sX5yU9+kq997Wvp7u7e7df/4he/mJNPPnkkYJPksccey+te97pd1s2fPz89PT2ZOnVqvv71r+/B7wAA1CFWAeBnzj777PT29o5530u9ZvVv/uZv8tRTT+Waa67J6aefnr//+7/Pr/zKr4zc/5a3vCVtbW35xCc+scuzqj/vwQcfzNVXX50vfelLu9z+2GOPjXpmNcnIpb1DQ0NjPt6TTz6ZBx54IF1dXTn77LPHXAMAlYlVAJiAnTt37va+oaGhPPvss3nggQdy3nnnpbe3N88880yS5IILLsgRRxyRd77znTn11FNHfe7w8HDWrl2b97znPfnwhz+cM844I0nywx/+ML29vbnnnnte8p19n3/++V0+HhwcTFNTU5qamnLOOefk1FNPzZlnnvlK/soAsF+JVQCYgOeee263b7r0G7/xG5kzZ07mzJmT2bNnp62tLa997WtH7v+d3/md3HLLLbnoooty//3357DDDhu579JLL80dd9yRG264Ie9+97tHbl+3bl06OjqybNmyLFu2bLdznXnmmVm6dOnIxwMDA2lqasqMGTPy05/+9GW9URQAVOJX1wDAPrJjx45MmzZtl9u2b9+ewcHBtLa27p+hAKAosQoAAEA5fs8qAAAA5YhVAAAAyhGrAAAAlLPf3g348ccf319fmleora0tW7du3d9j8Co5jgcGx/HA4DgeGBzHA4PjeGBwHCef2bNn7/Y+z6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKmVCsPv300/nYxz72kmtWrVqVq666KmvXrt0jgwEAAHDwGjdW+/v789nPfjYDAwO7XbNhw4Y0Go2sXLkyfX192bJlyx4dEgAAgINL83gLDjnkkFx++eX59Kc/vds1XV1dWbx4cZJkwYIF6e7uzqxZs3ZZ09nZmc7OziRJR0dH2traXs3ce13fO5bs7xHK6dvfA7BHOI4HBsdxV6+54779PcIr0tzcXH4/fDH742jOxwOD43hgcBx3NVn3x/8zbqxOnz593AcZGBjIjBkzkiQtLS3ZvHnzqDXt7e1pb28f+Xjr1q0vZ04A2K3Juqe0tbVN2tkBqG8y7DGzZ8/e7X175A2Wpk2blsHBwSTJjh070mg09sTDAgAAcJDaI7E6b968dHd3J0l6enoyc+bMPfGwAAAAHKRedqz29vZmzZo1u9y2aNGirFu3LqtXr8769euzcOHCPTYgAAAAB5+m4eHh4T3xQP39/dm0aVNOPPHEtLa2jrv+8ccf3xNfdq/Z+d637+8RAJigQ7/wlf09wisyGV+zan8EmDwmw/74Uq9ZHfcNliaqpaUlS5Z4h0AAAABevT3ymlUAAADYk8QqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACg
HLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQTvNEFq1atSq9vb1ZuHBhli1bNur+/v7+fOYzn8kzzzyTefPm5Y//+I/3+KAAAAAcPMZ9ZnXDhg1pNBpZuXJl+vr6smXLllFrvvGNb+T0009PR0dHnnvuuTz88MN7ZVgAAAAODuPGaldXVxYvXpwkWbBgQbq7u0etOfLII/M///M/2bZtW5588skcc8wxe35SAAAADhrjXgY8MDCQGTNmJElaWlqyefPmUWve/OY35/77789Xv/rVvPa1r01LS8uoNZ2dnens7EySdHR0pK2t7dXOvlf17e8BAJiw6nvK7jQ3N0+62e2PAJPHZNtjXmzcWJ02bVoGBweTJDt27Eij0Ri15rbbbst73/veTJ8+Pf/yL/+Sr3/962lvb99lTXt7+y63bd269dXODgBJJu+e0tbWNmlnB6C+ybDHzJ49e7f3jXsZ8Lx580Yu/e3p6cnMmTNHrdm2bVseffTRNBqNPPTQQ69iVAAAAJhArC5atCjr1q3L6tWrs379+syZMydr1qzZZc073vGOfP7zn8/v//7vp7+/P6effvpeGxgAAIADX9Pw8PDweIv6+/uzadOmnHjiiWltbd0jX/jxxx/fI4+zt+x879v39wgATNChX/jK/h7hFZmMlwHbHwEmj8mwP77UZcAT+j2rLS0tWbJkyR4bCAAAAF7KuJcBAwAAwL4mVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5Y
BQAAoByxCgAAQDliFQAAgHLEKgAAAOVMKFZXrVqVq666KmvXrn3JdTfddFO+853v7JHBAAAAOHiNG6sbNmxIo9HIypUr09fXly1btoy57nvf+16efvrpnHLKKXt8SAAAAA4uzeMt6OrqyuLFi5MkCxYsSHd3d2bNmrXLmqGhodx44405+eST8+1vfzuLFi0a9TidnZ3p7OxMknR0dKStrW1PzL/X9O3vAQCYsOp7yu40NzdPutntjwCTx2TbY15s3FgdGBjIjBkzkiQtLS3ZvHnzqDXf+MY3MmfOnJx77rn56le/mq1bt+ass87aZU17e3va29tHPt66deurnR0AkkzePaWtrW3Szg5AfZNhj5k9e/Zu7xv3MuBp06ZlcHAwSbJjx440Go1RazZv3pz29va0trbm137t19LV1fUqxgUAAOBgN26szps3L93d3UmSnp6ezJw5c9SaX/iFX0hf3wsXBv3oRz+a9E83AwAAsH+NG6uLFi3KunXrsnr16qxfvz5z5szJmjVrdlnzm7/5m+nq6sry5ctz99135+1vf/teGxgAAIAD37ivWZ0+fXqWL1+eTZs25dxzz01ra2vmzp27y5rDDz88V1xxxd6aEQAAgIPMuLGavPDGSkuWLNnbswAAAECSCVwGDAAAAPuaWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMqZUKyuWrUqV111VdauXfuS655++ulceeWVe2QwAAAADl7jxuqGDRvSaDSycuXK9PX1ZcuWLbtde+utt2ZwcHCPDggAAMDBZ9xY7erqyuLFi5MkCxYsSHd395jr/vu//ztTp05Na2vrHh0QAACAg0/zeAsGBgYyY8aMJElLS0s2b948as3Q0FDWrl2bP/3TP82f//mfj/k4nZ2d6ezsTJJ0dHSkra3t1cy91/Xt7wEAmLDqe8ruNDc3T7rZ7Y8Ak8dk22NebNxYnTZt2silvTt27Eij0Ri15stf/nKWLl2aI444YreP097envb29pGPt27d+krmBYBRJuue0tbWNmlnB6C+ybDHzJ49e7f3jXsZ8Lx580Yu/e3p6cnMmTNHrXnwwQdz9913Z8WKFXnkkUfyuc997lWMCwAAwMFu3GdWFy1alOXLl+epp57Kxo0bc9lll2XNmjW54IILRtZ8/OMfH/nzihUrcumll+6daQEAADgojBur06dPz/Lly7Np06ace+65aW1tzdy5c3e7fsWKFXtwPAAAAA5G48Zq8sIbKy1ZsmRvzwIAAABJJvCaVQAAANjX
xCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKCc5oksWrVqVXp7e7Nw4cIsW7Zs1P3bt2/Pddddl0ajkalTp+byyy9Pc/OEHhoAAABGGfeZ1Q0bNqTRaGTlypXp6+vLli1bRq1Zt25dzjnnnFx99dVpbW3Nxo0b98asAAAAHCTGffqzq6srixcvTpIsWLAg3d3dmTVr1i5rzjjjjJE/P/vssznqqKNGPU5nZ2c6OzuTJB0dHWlra3tVg+9tfft7AAAmrPqesjvNzc2Tbnb7I8DkMdn2mBcbN1YHBgYyY8aMJElLS0s2b96827U/+MEPsm3bthx//PGj7mtvb097e/vIx1u3bn0l8wLAKJN1T2lra5u0swNQ32TYY2bPnr3b+8a9DHjatGkZHBxMkuzYsSONRmPMdf39/bn55pvzvve97xWOCQAAAC8YN1bnzZuX7u7uJElPT09mzpw5as3Q0FCuvfbaXHjhhTn22GP3/JQAAAAcVMaN1UWLFmXdunVZvXp11q9fnzlz5mTNmjW7rLn33nuzefPm3H777VmxYkXuu+++vTYwAAAAB75xX7M6ffr0LF++PJs2bcq5556b1tbWzJ07d5c1S5cuzdKlS/fWjAAAABxkJvTLUFtaWrJkyZK9PQsAAAAkmcBlwAAAALCviVUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YBAAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADliFUAAADKEasAAACUI1YB
AAAoR6wCAABQjlgFAACgHLEKAABAOWIVAACAcsQqAAAA5YhVAAAAyhGrAAAAlCNWAQAAKEesAgAAUI5YBQAAoByxCgAAQDliFQAAgHLEKgAAAOWIVQAAAMoRqwAAAJQjVgEAAChHrAIAAFCOWAUAAKAcsQoAAEA5zRNZtGrVqvT29mbhwoVZtmzZK14DAAAAEzHuM6sbNmxIo9HIypUr09fXly1btryiNQAAADBR4z6z2tXVlcWLFydJFixYkO7u7syaNetlr+ns7ExnZ2eSpKOjI7Nnz94jf4G95v99Z39PAMBBoPx++GL2RwD2kXGfWR0YGMiMGTOSJC0tLXnmmWde0Zr29vZ0dHSko6Pj1c7MfvKRj3xkf4/AHuA4HhgcxwOD43hgcBwPDI7jgcFxPLCMG6vTpk3L4OBgkmTHjh1pNBqvaA0AAABM1LixOm/evHR3dydJenp6MnPmzFe0BgAAACZq3FhdtGhR1q1bl9WrV2f9+vWZM2dO1qxZ85JrFi5cuNcGZv9pb2/f3yOwBziOBwbH8cDgOB4YHMcDg+N4YHAcDyxNw8PDw+Mt6u/vz6ZNm3LiiSemtbX1Fa8BAACAiZhQrAIAAMC+NO6vrgFg33r66adz7bXX5hOf+MSY92/fvj3XXXddGo1Gpk6dmssvvzxNTU35wAc+kNe85jVJkj/6oz/K61//+n05NhyQxjsf77nnntx3331Jkm3btuVNb3pTLr74Yucj7GFj7X3Nzc3jrrE/Tm6eWSVJsmrVqvT29mbhwoVZtmzZqPttxpPDeP+oSsY+1uMdf/ad/v7+/PVf/3WeffbZfOpTnxpzzd13351Zs2blpJNOyhe+8IWcfPLJmTFjRu677768+93v3scTszvjnY87d+4c82eo87GOiZyPP+/mm2/OW9/61jQ1NTkfC5lI5CT2x+rG2vtOOeWUcdfYHye3cd9giQPfhg0b0mg0snLlyvT19WXLli2j1ixdujQrVqzIihUrcsIJJ+Rtb3tbenp6ctppp43cLlT3r/7+/nz2s5/NwMDAbteMdawncvzZdw455JBcfvnlOfzww3e75owzzshJJ52UJHn22Wdz1FFH5aGHHsr999+fj370o1m1alV27ty5r0ZmDBM5H8f6Gep8rGUi5+P/+clPfpKnn346xx13nPOxmHXr1uWcc87J1VdfndbW1mzcuHHUGvtjfWPtfRNZ43yc3MQq6erqyuLFi5MkCxYsGPk1RGOxGdc1kX9UjXWsX87xZ++bPn16pk+fPqG1P/jBD7Jt27Ycf/zxOe6443LNNdfkk5/8ZHbu3Jn/+q//2suT8lImcj6O9TPU+VjLyzkf77rrrixdujRJnI/FTCRy7I+Tx8/vfRNZ43yc3MQqGRgYyIwZM5IkLS0teeaZZ3a71mZc10T+UTXWsX45x586+vv7c/PNN+d973tfkuQNb3hDjj766CQv/O5rzwDsXxM5H8f6Gep8nJwajUa6uroyf/78JM7Hql4qcuyPk8OL976JrHE+Tm5ilUybNi2Dg4NJkh07dqTRaIy5zmY8+Y11rCd6/KljaGgo1157bS688MIce+yxSZLPfOYzeeSRR9JoNPLtb387b3jDG/bzlIxnrJ+hzsfJqbu7O29605vS1NSUxPlY0XiRY3+sb6y9byJrnI+Tm1gl8+bNG7m0paenJzNnzhxznc148hvrWE/0+LN/9Pb2Zs2aNbvcdu+992bz5s25/fbbs2LFitx3330577zzcsMNN+TDH/5wjj/++JFL3qhrrJ+hzsfaxjofk2Tjxo054YQTRj52PtYykcixP9b34r3vtttusz8eBLwbMNm+fXuWL1+eX/zFX8zGjRtz2WWX5Vvf+lYuuOCCXdb9wz/8Q4477riceuqpSZJHH300119/fYaHh3PKKafkXe961/4Ynxf5vzdr6e3tzTe/+c1djuOLj/XKlSuTZNRtE32NFvDSXup8HOtn6FjnqPMRXp177rkn//iP/zjy
n+rz58/Pzp077Y8wCYhVkrxwecymTZty4oknprW1dX+Pw1401rF2/KEO5yPsH/ZHqEesAgAAUI7XrAIAAFCOWAUAAKAcsQoAAEA5YhUAAIByxCoAAADl/H8Gjbza+GFhuwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 1164.96x720 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.figure(figsize=(16.18,10))\n",
    "plt.bar(a.index, a.values)\n",
    "plt.title('年龄分布')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'age_seg' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-266-08ccb0c19eee>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 年龄 + 性别  2：女  1：男\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcrosstab\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mage_seg\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'ba000_w2_3'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m: name 'age_seg' is not defined"
     ]
    }
   ],
   "source": [
    "# 年龄 + 性别  2：女  1：男\n",
    "pd.crosstab(age_seg,Demographic_Background['ba000_w2_3'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 267,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'age_seg' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-267-f364657d075f>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 年龄 + 婚姻情况\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;31m# 1：已婚与配偶一同居住； 2：已婚，但因为工作等原因暂时没有跟配偶一起居住；3：分居； 4：离异； 5：丧偶； 6：从未结婚\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcrosstab\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mage_seg\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'be001'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m: name 'age_seg' is not defined"
     ]
    }
   ],
   "source": [
    "# 年龄 + 婚姻情况 \n",
    "# 1：已婚与配偶一同居住； 2：已婚，但因为工作等原因暂时没有跟配偶一起居住；3：分居； 4：离异； 5：丧偶； 6：从未结婚\n",
    "pd.crosstab(age_seg,Demographic_Background['be001'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 312,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'age_seg' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-312-294ed5dbf0bc>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m# 年龄 + 户口类型\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;31m# 1：农业； 2：非农业； 3：统一居民户； 4：没有户口\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcrosstab\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mage_seg\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mDemographic_Background\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'bc002_w3_2'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m: name 'age_seg' is not defined"
     ]
    }
   ],
   "source": [
    "# 年龄 + 户口类型\n",
    "# 1：农业； 2：非农业； 3：统一居民户； 4：没有户口\n",
    "pd.crosstab(age_seg,Demographic_Background['bc002_w3_2'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3 家庭信息表  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.1. 父母、子女以及兄弟姐妹信息 Family_Information.dta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 父母信息\n",
    "    \n",
    "    [家庭受访者姓名]父母编码：1 亲生父亲，2 亲生母亲，3 养父，4 养母\n",
    "     [家庭受访者的配偶姓名]父母编码：5 亲生父亲，6 亲生母亲，7 养父，8 养母\n",
    "    - ca000_w4_1_1_：亲生父亲出生年份是否为zparbirth_1_； \n",
    "      - 1 Wrong则为ca001_w3_1_1_\n",
    "    - ca000_w3_2_1_：父亲是否建在\n",
    "    - ca000_w3_3_1_1_：去世年份；ca000_w3_3_2_1_：去世岁数\n",
    "    - ca000_w4_1_2_：亲生母亲出生年份是否为zparbirth_2_；\n",
    "      - 1 Wrong则为ca001_w3_1_2_\n",
    "    - ca000_w3_2_2_：母亲是否建在\n",
    "    - ca000_w3_3_1_2_：去世年份；ca000_w3_3_2_2_：去世岁数   \n",
    "\n",
    "    - ca009_w4_i：最高学历是否是zparedu_i； \n",
    "      - 2 Wrong则为ca009_i\n",
    "    - ca014_w4_i：职位最高的职业是否是zparocc_1_i；\n",
    "      - 2 No则为ca014_i\n",
    "    - ca023_w3_i：是否是共产党员\n",
    "    - ca016_i：现在一般住在哪里\n",
    "    \n",
    "    \n",
    "- 子女信息\n",
    "    - cb050_w3：子女（亲生、继子女和养子女）\n",
    "    - cb052_w4_i：不包括成人教育的最高学历是否是zchildedu_i；\n",
    "      - 2 Wrong则为cb052_w3_i\n",
    "    - cb053_i：现在一般住哪里 \n",
    "    - cb055_i：目前的户口类型 \n",
    "    - cb057_i：目前的户口所在地\n",
    "    - cb063_w3_2_i：是否是共产党员\n",
    "    - cb063_w4_6_i：信仰哪个宗教\n",
    "    - cb070_w4_i：在上学还是工作？；\n",
    "      - cb071_i：现在主要做什么工作；\n",
    "      - cb059_w4_i：现在在上什么学\n",
    "    - cb063_i：婚姻状况\n",
    "    - cb063_w3_1_i：身体状况\n",
    "\n",
    "\n",
    "- 兄弟姐妹信息\n",
    "    - cc000_w4_1：一共有多少兄弟姐妹（包括继兄弟姐妹和养兄弟姐妹）\n",
    "    - cc003_w4_1_i：不包括成人教育的最高学历是否是zsibedu_i；\n",
    "      - 2 Wrong则为cc003_w3_i\n",
    "    - cc015_w3_i：职位最高的职业是什么\n",
    "    - cc004_w3_i：是否是共产党员\n",
    "    - cc011_w3_i：婚姻状况\n",
    "    - cc012_w3_i：身体状况\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 270,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_Family_Information = pd.read_stata(file_path_Family_Information, convert_categoricals=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(11628, 1539)"
      ]
     },
     "execution_count": 271,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_Family_Information.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 272,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>householdID</th>\n",
       "      <th>communityID</th>\n",
       "      <th>cv009</th>\n",
       "      <th>ca000_w4_0_1_</th>\n",
       "      <th>ca000_w4_0_2_</th>\n",
       "      <th>ca000_w4_1_1_</th>\n",
       "      <th>ca000_w4_1_2_</th>\n",
       "      <th>ca000_w3_1_1_</th>\n",
       "      <th>ca000_w3_1_2_</th>\n",
       "      <th>ca000_w3_2_1_</th>\n",
       "      <th>ca000_w3_2_2_</th>\n",
       "      <th>ca000_w3_3_1_1_</th>\n",
       "      <th>ca000_w3_3_1_2_</th>\n",
       "      <th>ca000_w3_3_2_1_</th>\n",
       "      <th>ca000_w3_3_2_2_</th>\n",
       "      <th>ca001_w4_0_1_</th>\n",
       "      <th>ca001_w4_0_2_</th>\n",
       "      <th>ca001_w4_1_1_</th>\n",
       "      <th>ca001_w4_1_2_</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>094004113002</td>\n",
       "      <td>0940041130</td>\n",
       "      <td>0940041</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2005.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>094004111001</td>\n",
       "      <td>0940041110</td>\n",
       "      <td>0940041</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1997.0</td>\n",
       "      <td>2011.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>094004112001</td>\n",
       "      <td>0940041120</td>\n",
       "      <td>0940041</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1996.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>094004114001</td>\n",
       "      <td>0940041140</td>\n",
       "      <td>0940041</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1918.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1985.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>094004118001</td>\n",
       "      <td>0940041180</td>\n",
       "      <td>0940041</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1964.0</td>\n",
       "      <td>2003.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID householdID communityID  cv009  ca000_w4_0_1_  ca000_w4_0_2_  \\\n",
       "0  094004113002  0940041130     0940041    NaN              1            NaN   \n",
       "1  094004111001  0940041110     0940041    2.0              1            1.0   \n",
       "2  094004112001  0940041120     0940041    NaN              1            NaN   \n",
       "3  094004114001  0940041140     0940041    NaN              1            NaN   \n",
       "4  094004118001  0940041180     0940041    1.0              1            1.0   \n",
       "\n",
       "   ca000_w4_1_1_  ca000_w4_1_2_  ca000_w3_1_1_  ca000_w3_1_2_  ca000_w3_2_1_  \\\n",
       "0            1.0            NaN            NaN            NaN            2.0   \n",
       "1            1.0            1.0            NaN            NaN            2.0   \n",
       "2            1.0            NaN            NaN            NaN            2.0   \n",
       "3            NaN            NaN         1918.0            NaN            2.0   \n",
       "4            1.0            1.0            NaN            NaN            2.0   \n",
       "\n",
       "   ca000_w3_2_2_  ca000_w3_3_1_1_  ca000_w3_3_1_2_  ca000_w3_3_2_1_  \\\n",
       "0            NaN           2005.0              NaN              NaN   \n",
       "1            2.0           1997.0           2011.0              NaN   \n",
       "2            NaN           1996.0              NaN              NaN   \n",
       "3            NaN           1985.0              NaN              NaN   \n",
       "4            2.0           1964.0           2003.0              NaN   \n",
       "\n",
       "   ca000_w3_3_2_2_  ca001_w4_0_1_  ca001_w4_0_2_  ca001_w4_1_1_  ca001_w4_1_2_  \n",
       "0              NaN            1.0            NaN            1.0            NaN  \n",
       "1              NaN            1.0            1.0            1.0            1.0  \n",
       "2              NaN            1.0            NaN            1.0            NaN  \n",
       "3              NaN            1.0            NaN            NaN            NaN  \n",
       "4              NaN            1.0            1.0            1.0            1.0  "
      ]
     },
     "execution_count": 272,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_Family_Information.iloc[0:5,0:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 281,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ID                    0\n",
       "householdID           0\n",
       "communityID           0\n",
       "cv009              2929\n",
       "ca000_w4_0_1_         0\n",
       "                  ...  \n",
       "zsibedu_s_12_     11623\n",
       "zsibedu_s_13_     11626\n",
       "zsibedu_s_14_     11627\n",
       "hhmembernumber       30\n",
       "versionID             0\n",
       "Length: 1539, dtype: int64"
      ]
     },
     "execution_count": 281,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check 缺失值； 等具体问题分析再处理数据清洗问题\n",
    "data_Family_Information.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.2. 家庭交往与经济帮助 Family_Transfer.dta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 与父母、子女间的交往 \n",
    "- 与父母的交往\n",
    "    - cd001_w4_i：父母现在和谁住在一起\n",
    "    - cd002_w4_i：不住一起时多长时间看望一次\n",
    "- 与子女的交往    \n",
    "    - cd003_w4_i：与子女一起居住的时间\n",
    "    - cd003_i：不住一起时多长时间见一次\n",
    "    - cd004_i：不住一起时多长时间通过电话、短信、微信、信件或电子邮件联系一次\n",
    "\n",
    "\n",
    "- 家庭得到及提供的经济帮助 \n",
    "- 与父母间的经济支持\n",
    "    - ce002_1：不住一起时父母总共给钱的数目；\n",
    "    - ce002_2：父母定期给钱（生活费、水电费及电话费、房贷、房租费）的数目\n",
    "    - ce002_3：父母总共给物的钱数；\n",
    "    - ce002_4：父母定期给物（粮食、买菜、买衣服）的钱数\n",
    "    - ce022_1：不住一起时总共给父母钱的数目；\n",
    "    - ce022_2：定期给父母钱（生活费、水电费及电话费、房贷、房租费）的数目\n",
    "    - ce022_3：总共给父母物的钱数；\n",
    "    - ce022_4：定期给父母物（粮食、买菜、买衣服）的钱数\n",
    "- 与子女间的经济支持    \n",
    "    - ce009_1：不住一起时子女总共给钱的数目；\n",
    "    - ce009_2：子女定期给钱（生活费、水电费及电话费、房贷、房租费）的数目\n",
    "    - ce009_3：子女总共给物的钱数；\n",
    "    - ce009_4：子女定期给物（粮食、买菜、买衣服）的钱数\n",
    "    - ce029_1：不住一起时总共给子女钱的数目；\n",
    "    - ce029_2：定期给子女钱（生活费、水电费及电话费、房贷、房租费）的数目\n",
    "    - ce029_3：总共给子女物的钱数；\n",
    "    - ce029_4：定期给子女物（粮食、买菜、买衣服）的钱数\n",
    "- 与兄弟姐妹间的经济支持    \n",
    "    - ce072_w2_1：从兄弟姐妹那里收到的总钱数（婚丧嫁娶、搬迁新房、新生儿、子女升学等情况下的随礼以及生病、生活困难等情况下的经济资助）\n",
    "    - ce072_w2_2：兄弟姐妹定期给钱（生活费、水电费及电话费、房贷、房租费）的数目\n",
    "    - ce072_w2_3：兄弟姐妹总共给物的钱数；\n",
    "    - ce072_w2_4：兄弟姐妹定期给物（粮食、买菜、买衣服）的钱数\n",
    "    - ce074_w2_1：总共给兄弟姐妹钱的数目；\n",
    "    - ce074_w2_2：定期给兄弟姐妹钱（生活费、水电费及电话费、房贷、房租费）的数目\n",
    "    - ce074_w2_3：总共给兄弟姐妹物的钱数；\n",
    "    - ce074_w2_4：定期给兄弟姐妹物（粮食、买菜、买衣服）的钱数   \n",
    "- 与其他亲戚朋友间的经济支持    \n",
    "    - ce016_w4：因为婚丧嫁娶、搬迁新房、新生儿、子女升学等办酒席花费的钱数。\n",
    "      - 没有为0、不知道或拒绝回答为-1\n",
    "    - ce016_w3：过去一年从亲戚朋友那里收到的礼金数目（婚丧嫁娶、搬迁新房、新生儿、子女升学等情况下的随礼）\n",
    "    - ce036_w3：过去一年给亲戚朋友的礼金数目\n",
    "    - ce016：过去一年除礼金外，从亲戚朋友那里收到的钱物数目（包括生病、生活困难等情况下的经济资助，不包括借钱）\n",
    "    - ce036：过去一年除礼金外，给亲戚朋友的钱物数目数目\n",
    "    \n",
    "\n",
    "- 提供照料时间\n",
    "    - 过去一年，我大概花cf003_1周，一周cf003_2小时照看子女的孩子；我爱人大概花cf003_3周，一周cf003_4小时照看子女的孩子。\n",
    "      - 没有照料为0、不足一周为1、一周不足一小时为1\n",
    "    - cf004_w4：是否在日常活动方面给父母提供帮助（家务劳动、做饭、洗衣、外出、购物和财务管理）\n",
    "      - 1 Yes，过去一年家庭受访者大约花cf005_w4_1周，一周cf005_w4_2小时照看父母\n",
    "      - 家庭受访者的配偶大约花cf006_w4_1周，一周cf006_w4_2小时照看父母\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 277,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_Family_Transfer = pd.read_stata(file_path_Family_Transfer, convert_categoricals=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(11568, 1120)"
      ]
     },
     "execution_count": 278,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_Family_Transfer.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 282,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>householdID</th>\n",
       "      <th>communityID</th>\n",
       "      <th>cd001_w4_1_</th>\n",
       "      <th>cd001_w4_2_</th>\n",
       "      <th>cd001_w4_3_</th>\n",
       "      <th>cd001_w4_4_</th>\n",
       "      <th>cd001_w4_5_</th>\n",
       "      <th>cd001_w4_6_</th>\n",
       "      <th>cd001_w4_7_</th>\n",
       "      <th>cd001_w4_8_</th>\n",
       "      <th>cd001_w3_1_1__s1</th>\n",
       "      <th>cd001_w3_1_1__s2</th>\n",
       "      <th>cd001_w3_1_1__s3</th>\n",
       "      <th>cd001_w3_1_1__s4</th>\n",
       "      <th>cd001_w3_1_1__s5</th>\n",
       "      <th>cd001_w3_1_1__s6</th>\n",
       "      <th>cd001_w3_1_1__s7</th>\n",
       "      <th>cd001_w3_1_1__s8</th>\n",
       "      <th>cd001_w3_1_1__s9</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>094004113002</td>\n",
       "      <td>0940041130</td>\n",
       "      <td>0940041</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>094004111001</td>\n",
       "      <td>0940041110</td>\n",
       "      <td>0940041</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>094004112001</td>\n",
       "      <td>0940041120</td>\n",
       "      <td>0940041</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>094004114001</td>\n",
       "      <td>0940041140</td>\n",
       "      <td>0940041</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>094004118001</td>\n",
       "      <td>0940041180</td>\n",
       "      <td>0940041</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID householdID communityID  cd001_w4_1_  cd001_w4_2_  \\\n",
       "0  094004113002  0940041130     0940041          NaN          NaN   \n",
       "1  094004111001  0940041110     0940041          NaN          NaN   \n",
       "2  094004112001  0940041120     0940041          NaN          NaN   \n",
       "3  094004114001  0940041140     0940041          NaN          NaN   \n",
       "4  094004118001  0940041180     0940041          NaN          NaN   \n",
       "\n",
       "   cd001_w4_3_  cd001_w4_4_  cd001_w4_5_  cd001_w4_6_  cd001_w4_7_  \\\n",
       "0          NaN          NaN          NaN          NaN          NaN   \n",
       "1          NaN          NaN          NaN          NaN          NaN   \n",
       "2          NaN          NaN          NaN          NaN          NaN   \n",
       "3          NaN          NaN          NaN          NaN          NaN   \n",
       "4          NaN          NaN          NaN          NaN          NaN   \n",
       "\n",
       "   cd001_w4_8_  cd001_w3_1_1__s1  cd001_w3_1_1__s2  cd001_w3_1_1__s3  \\\n",
       "0          NaN               NaN               NaN               NaN   \n",
       "1          NaN               NaN               NaN               NaN   \n",
       "2          NaN               NaN               NaN               NaN   \n",
       "3          NaN               NaN               NaN               NaN   \n",
       "4          NaN               NaN               NaN               NaN   \n",
       "\n",
       "   cd001_w3_1_1__s4  cd001_w3_1_1__s5  cd001_w3_1_1__s6  cd001_w3_1_1__s7  \\\n",
       "0               NaN               NaN               NaN               NaN   \n",
       "1               NaN               NaN               NaN               NaN   \n",
       "2               NaN               NaN               NaN               NaN   \n",
       "3               NaN               NaN               NaN               NaN   \n",
       "4               NaN               NaN               NaN               NaN   \n",
       "\n",
       "   cd001_w3_1_1__s8  cd001_w3_1_1__s9  \n",
       "0               NaN               NaN  \n",
       "1               NaN               NaN  \n",
       "2               NaN               NaN  \n",
       "3               NaN               NaN  \n",
       "4               NaN               NaN  "
      ]
     },
     "execution_count": 282,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_Family_Transfer.iloc[0:5,0:20].iloc[0:5,0:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 283,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ID                   0\n",
       "householdID          0\n",
       "communityID          0\n",
       "cd001_w4_1_      10130\n",
       "cd001_w4_2_       9833\n",
       "                 ...  \n",
       "cf006_w4_2_5_    10967\n",
       "cf006_w4_2_6_    10962\n",
       "cf006_w4_2_7_    11561\n",
       "cf006_w4_2_8_    11560\n",
       "versionID            0\n",
       "Length: 1120, dtype: int64"
      ]
     },
     "execution_count": 283,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check 缺失值； 等具体问题分析再处理数据清洗问题\n",
    "data_Family_Transfer.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4 健康信息表  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.1. 健康状况和功能信息 Health_Status_and_Functioning.dta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 健康状况\n",
    "- 一般健康状况和疾病史\n",
    "    - da002：健康状况\n",
    "      - 回访者与之前相比时的状态：da002_w2_1\n",
    "    - da005：是否有残疾（躯体残疾、大脑受损、失明、聋、哑）问题\n",
    "    - da006：开始患有的年份\n",
    "    - da007：是否有医生曾告诉过患有慢性病（高血压、血脂异常、糖尿病等）\n",
    "    - da009_1：第一次知道患有慢性病的年份、岁数da009_2\n",
    "    - da010_w4：正在采用治疗慢性病的治疗方法\n",
    "    - da048：15岁之前（包括15岁）的身体状况\n",
    "    \n",
    "    \n",
    "- 生活方式和健康行为\n",
    "    - da049：过去一个月内平均每天晚上真正睡着的小时数\n",
    "    - da050：过去一个月内的午睡时间（分钟）\n",
    "    - da051：是否每周至少持续体力活动（有氧运动、快速骑车，拖地、打太极，散步）十分钟\n",
    "      - 1 Yes\n",
    "        - da052：每周至少做体力活动十分钟的天数\n",
    "        - da053：一天做体力活动的时间是否大于2小时；\n",
    "          - da054：是否小于30分钟；- da055：是否小于4小时\n",
    "    - da051_1：做体力活动的原因\n",
    "    - da056：过去一个月进行的社交活动\n",
    "      - 10 上网\n",
    "        - da056_w4_1：上网的目的\n",
    "        - da056_w4_2：是否会用手机支付（支付宝、微信）\n",
    "        - da056_w4_3：是否使用微信\n",
    "        - da056_w4_4：发不发微信朋友圈\n",
    "     - da059：是否吸过烟\n",
    "       - 1 Yes\n",
    "         - da061：现在是否还在吸烟\n",
    "         - da060：吸烟时一般抽什么烟\n",
    "     - da067：过去一年是否喝酒、喝酒频率\n",
    "         - da068：喝酒的种类\n",
    "         - da071_1：开始饮酒的年份、da071_2：开始饮酒的年龄  \n",
    "\n",
    "\n",
    "- 身体功能障碍以及辅助者\n",
    "    - db001：跑或慢跑1公里是否有困难\n",
    "      - 1 没有困难，2 有困难但仍可以完成，3 有困难、需要帮助，4 无法完成\n",
    "    - db002：走1公里是否有困难\n",
    "    - db003：走100米是否有困难\n",
    "    - db004：在椅子上坐时间久了再站起来是否有困难\n",
    "    - db005：连续不停地爬几层楼是否有困难\n",
    "    - db006：弯腰、屈膝或者下蹲是否有困难\n",
    "    - db007：手臂沿着肩向上伸展是否有困难\n",
    "    - db008：提10斤重的东西是否有困难\n",
    "    - db009：从桌上拿起一枚硬币是否有困难\n",
    "    - db016：是否因为健康和记忆的原因，做家务活（房屋清洁、洗碗盘，整理被褥和房间摆设）有困难\n",
    "      - db016_w2：做家务时是否有人帮助你\n",
    "    - db019：是否因为健康和记忆的原因，管钱（支付账单、记录支出项目、管理财务）有困难\n",
    "      - db019_w2：是否有人帮助你管钱\n",
    "    - db022_w3_1：谁在困难中帮助你\n",
    "      - 帮助了db023天，每天帮助db024小时\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_Health_Status_and_Functioning = pd.read_stata(file_path_Health_Status_and_Functioning, convert_categoricals=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(19752, 896)"
      ]
     },
     "execution_count": 285,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_Health_Status_and_Functioning.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>householdID</th>\n",
       "      <th>communityID</th>\n",
       "      <th>da002</th>\n",
       "      <th>da002_w2_1</th>\n",
       "      <th>da005_1_</th>\n",
       "      <th>da005_2_</th>\n",
       "      <th>da005_3_</th>\n",
       "      <th>da005_4_</th>\n",
       "      <th>da005_5_</th>\n",
       "      <th>da006_1_</th>\n",
       "      <th>da006_2_</th>\n",
       "      <th>da006_3_</th>\n",
       "      <th>da006_4_</th>\n",
       "      <th>da006_5_</th>\n",
       "      <th>da006_w4_1_</th>\n",
       "      <th>da006_w4_2_</th>\n",
       "      <th>da006_w4_3_</th>\n",
       "      <th>da006_w4_4_</th>\n",
       "      <th>da006_w4_5_</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>094004113002</td>\n",
       "      <td>0940041130</td>\n",
       "      <td>0940041</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>094004111002</td>\n",
       "      <td>0940041110</td>\n",
       "      <td>0940041</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>094004111001</td>\n",
       "      <td>0940041110</td>\n",
       "      <td>0940041</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>094004112001</td>\n",
       "      <td>0940041120</td>\n",
       "      <td>0940041</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>094004118001</td>\n",
       "      <td>0940041180</td>\n",
       "      <td>0940041</td>\n",
       "      <td>5.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID householdID communityID  da002  da002_w2_1  da005_1_  \\\n",
       "0  094004113002  0940041130     0940041    1.0         2.0       2.0   \n",
       "1  094004111002  0940041110     0940041    4.0         2.0       2.0   \n",
       "2  094004111001  0940041110     0940041    4.0         2.0       2.0   \n",
       "3  094004112001  0940041120     0940041    1.0         2.0       NaN   \n",
       "4  094004118001  0940041180     0940041    5.0         3.0       NaN   \n",
       "\n",
       "   da005_2_  da005_3_  da005_4_  da005_5_  da006_1_  da006_2_  da006_3_  \\\n",
       "0       2.0       2.0       2.0       2.0       NaN       NaN       NaN   \n",
       "1       2.0       2.0       2.0       2.0       NaN       NaN       NaN   \n",
       "2       2.0       2.0       2.0       2.0       NaN       NaN       NaN   \n",
       "3       2.0       NaN       2.0       2.0       NaN       NaN       NaN   \n",
       "4       2.0       2.0       2.0       2.0       NaN       NaN       NaN   \n",
       "\n",
       "   da006_4_  da006_5_  da006_w4_1_  da006_w4_2_  da006_w4_3_  da006_w4_4_  \\\n",
       "0       NaN       NaN          NaN          NaN          NaN          NaN   \n",
       "1       NaN       NaN          NaN          NaN          NaN          NaN   \n",
       "2       NaN       NaN          NaN          NaN          NaN          NaN   \n",
       "3       NaN       NaN          NaN          NaN          NaN          NaN   \n",
       "4       NaN       NaN          NaN          NaN          NaN          NaN   \n",
       "\n",
       "   da006_w4_5_  \n",
       "0          NaN  \n",
       "1          NaN  \n",
       "2          NaN  \n",
       "3          NaN  \n",
       "4          NaN  "
      ]
     },
     "execution_count": 286,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_Health_Status_and_Functioning.iloc[0:5,0:20].iloc[0:5,0:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ID                 0\n",
       "householdID        0\n",
       "communityID        0\n",
       "da002           1474\n",
       "da002_w2_1       338\n",
       "               ...  \n",
       "zda065         12185\n",
       "zda069          6226\n",
       "zda070         18251\n",
       "zda071         13662\n",
       "versionID          0\n",
       "Length: 896, dtype: int64"
      ]
     },
     "execution_count": 287,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check 缺失值； 等具体问题分析再处理数据清洗问题\n",
    "data_Health_Status_and_Functioning.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2. 认知和抑郁信息 Cognition.dta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.3. 知情人信息健康状况和功能信息 Insider.dta"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
